Exploring Market Basket Analysis in Istanbul Retail Data
Load the packages
# Attach the plotting, data-manipulation and interactive-graphics packages.
library(ggplot2)
library(dplyr)
library(plotly)
# pacman is attached so that p_load() below is available.
library(pacman)
# p_load() handles the remaining packages in one call; ggplot2 and
# summarytools also have their own library() calls in this block.
p_load(ggplot2, data.table, rgl, summarytools, GGally, factoextra, cluster, tidyverse)
library(summarytools)
# Status message shown once every call above has returned.
print("Required Packages are loaded" )
## [1] "Required Packages are loaded"
Read the data set
# Absolute location of the customer shopping CSV used by this analysis.
csv_path <- "C:\\Users\\LENOVO\\Downloads\\customer_shopping_data.csv"
# Import the file into a data frame and preview its first rows.
sales <- read.csv(file = csv_path)
head(x = sales)
## invoice_no customer_id gender age category quantity price payment_method
## 1 I138884 C241288 Female 28 Clothing 5 1500.40 Credit Card
## 2 I317333 C111565 Male 21 Shoes 3 1800.51 Debit Card
## 3 I127801 C266599 Male 20 Clothing 1 300.08 Cash
## 4 I173702 C988172 Female 66 Shoes 5 3000.85 Credit Card
## 5 I337046 C189076 Female 53 Books 4 60.60 Cash
## 6 I227836 C657758 Female 28 Clothing 5 1500.40 Credit Card
## invoice_date shopping_mall
## 1 5/8/2022 Kanyon
## 2 12/12/2021 Forum Istanbul
## 3 9/11/2021 Metrocity
## 4 16/05/2021 Metropol AVM
## 5 24/10/2021 Kanyon
## 6 24/05/2022 Forum Istanbul
Statistics of the Dataset
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(sales)
## invoice_no customer_id gender age
## Length:99457 Length:99457 Length:99457 Min. :18.00
## Class :character Class :character Class :character 1st Qu.:30.00
## Mode :character Mode :character Mode :character Median :43.00
## Mean :43.43
## 3rd Qu.:56.00
## Max. :69.00
## category quantity price payment_method
## Length:99457 Min. :1.000 Min. : 5.23 Length:99457
## Class :character 1st Qu.:2.000 1st Qu.: 45.45 Class :character
## Mode :character Median :3.000 Median : 203.30 Mode :character
## Mean :3.003 Mean : 689.26
## 3rd Qu.:4.000 3rd Qu.:1200.32
## Max. :5.000 Max. :5250.00
## invoice_date shopping_mall
## Length:99457 Length:99457
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Compact structure listing: each column's type and its first few values.
str(sales)
## 'data.frame': 99457 obs. of 10 variables:
## $ invoice_no : chr "I138884" "I317333" "I127801" "I173702" ...
## $ customer_id : chr "C241288" "C111565" "C266599" "C988172" ...
## $ gender : chr "Female" "Male" "Male" "Female" ...
## $ age : int 28 21 20 66 53 28 49 32 69 60 ...
## $ category : chr "Clothing" "Shoes" "Clothing" "Shoes" ...
## $ quantity : int 5 3 1 5 4 5 1 2 3 2 ...
## $ price : num 1500.4 1800.5 300.1 3000.8 60.6 ...
## $ payment_method: chr "Credit Card" "Debit Card" "Cash" "Credit Card" ...
## $ invoice_date : chr "5/8/2022" "12/12/2021" "9/11/2021" "16/05/2021" ...
## $ shopping_mall : chr "Kanyon" "Forum Istanbul" "Metrocity" "Metropol AVM" ...
# Build the summarytools overview of every column, then print it with the
# "render" method.
sales_overview <- dfSummary(sales)
print(sales_overview, method = 'render')
Exploratory Data Analysis
Distribution of Age
# Overlaid age density curves, one semi-transparent fill per gender.
age_density <- ggplot(data = sales, mapping = aes(x = age, fill = gender))
age_density <- age_density + geom_density(alpha = 0.5)
age_density + labs(title = "Distribution of Age")

Who shops more?
# Bar chart counting the rows of `sales` for each gender value.
gender_bars <- geom_bar(fill = "cornflowerblue", color = "black")
gender_labels <- labs(title = " Total no.of shoppers", x = "Gender")
ggplot(data = sales, mapping = aes(x = gender)) +
  gender_bars +
  gender_labels

Preferred Categories by Gender
# Price against quantity, coloured by category, with one facet row per gender.
# labs() is chained with `+` so the title and axis names are applied to the
# plot; as a separate statement it only builds and prints a "labels" object
# and the figure is drawn without them.
ggplot(sales, aes(x = quantity, y = price, color = category)) +
  geom_point(size = 2) +
  geom_smooth() +
  facet_grid(gender ~ .) +
  labs(title = "Preferred Categories by Gender", x = "Quantity", y = "Price")
Average Price for each category
# Box plot of price for each category; a white point (shape 18) marks the
# category mean, and the legend is hidden because the x axis already names
# each category.
price_boxes <- ggplot(
  data = sales,
  mapping = aes(x = category, y = price, fill = category)
) +
  geom_boxplot()
mean_marker <- stat_summary(
  fun = "mean", geom = "point", shape = 18, size = 3, color = "white"
)
price_boxes +
  mean_marker +
  labs(title = "Average Price by Category", x = "Category", y = "Price") +
  theme(legend.position = "none")

Total Sales of each Category
# Total sales per category.
# Sales are computed per invoice line (quantity * price) and then summed within
# each category. Multiplying a category's summed quantity by its summed price
# gives a product of two sums, which is not a sales total.
# NOTE(review): this keeps the script's definition of a line's sales as
# quantity * price. The rows printed by head(sales) show price scaling with
# quantity within a category (Clothing: 5 -> 1500.40, 1 -> 300.08), so `price`
# may already be a line total; if so, sum(price) alone is the revenue - confirm.
line_items <- data.frame(
  category = sales$category,
  quantity = sales$quantity,
  price = sales$price,
  total_sales = sales$quantity * sales$price
)
# one row per category: summed quantity, summed price, summed line sales
category_sales <- aggregate(
  cbind(quantity, price, total_sales) ~ category,
  data = line_items,
  FUN = sum
)
# sort the categories by total sales (descending order)
category_sales <- category_sales[order(category_sales$total_sales, decreasing = TRUE), ]
# print the top 5 categories by total sales
head(category_sales, n = 5)
# Interactive point chart of total sales per category.
# The `text` aesthetic carries the hover label shown by plotly.
sales_points <- ggplot(
  data = category_sales,
  mapping = aes(
    x = category,
    y = total_sales,
    text = paste("Category: ", category, "<br>Total Sales: $", total_sales)
  )
)
sales_points <- sales_points + geom_point()
sales_points <- sales_points +
  labs(title = "Category vs. Total Sales", x = "Category", y = "Total Sales")
# Rotate the category names so they do not overlap on the x axis.
gg <- sales_points + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Convert to plotly, restricting the tooltip to the custom hover text.
ggplotly(gg, tooltip = "text")
Linear regression smooth of price on age
# Linear fit of price on age, one line per invoice year.
# `sales` has no `year` column (str(sales) lists ten columns, none named
# year), so the year is derived from invoice_date, whose values are
# day/month/year strings such as "16/05/2021".
invoice_dates <- as.Date(sales$invoice_date, format = "%d/%m/%Y")
sales$year1 <- factor(format(invoice_dates, "%Y"))
ggplot(sales, aes(x = age, y = price, color = year1)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Linear Regression Plot of Age vs. Price",
    x = "Age",
    y = "Price"
  )

Clustering
# K-means (k = 3) on standardized quantity and price, shown as an interactive
# scatter plot.
# subset the data to only include quantity and price columns
data <- subset(sales, select = c(quantity, price))
# standardize the data to have zero mean and unit variance
data_scaled <- scale(data)
# fix the seed so the random initial centers are reproducible
set.seed(123)
kmeans_obj <- kmeans(data_scaled, centers = 3)
# add cluster labels to the original data
sales$cluster <- kmeans_obj$cluster
# The `text` aesthetic supplies the hover label: ggplotly(tooltip = "text")
# displays only that aesthetic, so it has to be mapped for tooltips to appear.
p <- ggplot(
  sales,
  aes(
    x = quantity,
    y = price,
    color = factor(cluster),
    text = paste0(
      "Quantity: ", quantity,
      "<br>Price: ", price,
      "<br>Cluster: ", cluster
    )
  )
) +
  geom_point() +
  labs(
    title = "K-Means Clustering Results",
    x = "Quantity",
    y = "Price",
    color = "Cluster"
  ) +
  theme_minimal()
# convert ggplot to plotly and make it interactive
p <- ggplotly(p, tooltip = "text")
# namespace-qualified so that graphics::layout() is never picked up instead
p <- plotly::layout(p, title = "K-Means Clustering Results")
p
Clustering on age and price, visualized with PCA
# K-means (k = 3) on standardized age and price, displayed on the principal
# component axes.
# subset the data to only include age and price columns
data <- subset(sales, select = c(age, price))
# standardize the data to have zero mean and unit variance
data_scaled <- scale(data)
# fix the seed so the random initial centers are reproducible
set.seed(123)
kmeans_obj <- kmeans(data_scaled, centers = 3)
# add cluster labels to the original data
# (this replaces any labels already stored in sales$cluster)
sales$cluster <- kmeans_obj$cluster
# compute PCA for data visualization; with only two input columns this
# re-expresses the scaled data on two component axes rather than reducing
# its dimension
pca <- prcomp(data_scaled)
# plot the first two principal components colored by cluster; the labels are
# passed as a factor, which is the type `habillage` is documented to take
fviz_pca_ind(
  pca,
  geom = "point",
  habillage = factor(sales$cluster),
  ggtheme = theme_minimal()
)
